\documentclass[final]{report}
\usepackage[a4paper, top=3cm, bottom=3cm]{geometry}
\usepackage{amsmath}
\usepackage{color}

\definecolor{gray}{rgb}{0.5,0.5,0.5}
\definecolor{commentgreen}{rgb}{0.1,0.6,0.1}

% Glenn added for code support
\usepackage{listings}
\lstset{
language=R,
basicstyle=\scriptsize\ttfamily,
commentstyle=\ttfamily\color{commentgreen},
numbers=left,
numberstyle=\ttfamily\color{gray}\footnotesize,
stepnumber=1,
numbersep=5pt,
backgroundcolor=\color{white},
showspaces=false,
showstringspaces=false,
showtabs=false,
frame=single,
tabsize=2,
captionpos=b,
breaklines=true,
breakatwhitespace=false,
title=\lstname,
escapechar=@,
keywordstyle={},
morekeywords={}
}


% Turn off all indenting:
\setlength{\parindent}{0cm}

%Add all necessary variable commands here
%\newcommand{}{{}}

% Probability
\newcommand{\Probability}{{P}}
\newcommand{\Event}{{E}}
\newcommand{\Sample}{{S}}
\newcommand{\Expected}{{E}}
\newcommand{\Variance}{{V}}
\newcommand{\StandardDev}{\sigma}
\newcommand{\Correlation}{\rho}
\newcommand{\Mean}{\mu}
\newcommand{\PopProportion}{{p}}
\newcommand{\NullHypothesis}{H_{0}}
\newcommand{\AltHypothesis}{H_{a}}
\newcommand{\HypothesisVal}{\theta_0}

\title{Data Analysis and Statistics}
\author{Alexandra Booth \and Glenn Sweeney}
\date{Practical - Tuesday, January 14th @ 2pm \\ Written - Wednesday, January 15th @ 9am}

\begin{document}

\maketitle
\newpage

% Build the table of contents
\tableofcontents

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Statistics}

% Variable definitions section
\section{Variable Definitions}

% Put definitions of all variables used here, in the following format:
% <variable command>: <name in words> - Definition

$\Probability$: Probability - the ratio of the number of outcomes favorable to an event $\Event$ to the total number of outcomes in the sample space \\
$\Event$: Event - any collection of outcomes from the sample space of a chance experiment \\
$\Sample$: Sample Space - sample space for an experiment \\
X: Numerical Variable - used as a random variable \\
x: Value in a Set - all the possible values in the set make up a random variable \emph{X}\\
p(x): Probability Distribution of a Discrete Random Variable $\Rightarrow$ P(X = x) \emph{or} p(x) \\
$\Expected(X)$: Expected Value - the mean value of a random variable \emph{X}\\
$\Variance(X)$ \emph{or} $\sigma_{X}^{2}$ : Variance of X -  a measure of how far a set of numbers is spread out; the standard deviation squared \\
$\StandardDev_{X}$ \emph{or} $\sqrt{\Variance(X)}$ : Standard Deviation of X - how much variation or dispersion from the average exists; the square root of variance\\
$\Correlation$: Correlation Coefficient - measures the linear dependence between two random variables \\
$f(x)$: Density Function - function that defines a density curve; describes the probability for a continuous random variable X \\
$\Mean$: Mean - average \\
$\PopProportion$: Population Proportion - the proportion of a population possessing the characteristic of interest \\
$A$: value of the bound on the error for a confidence interval; used when determining an adequate sample size \\
$\NullHypothesis$: Null Hypothesis - a claim about a population characteristic, $\theta$, that is initially assumed to be true\\
$\AltHypothesis$: Alternative Hypothesis - the competing claim against $\NullHypothesis$ \\
$\HypothesisVal$: Hypothesis Value - a specific number determined by the problem context \\
$\alpha$: Significance Level (Hypothesis Error Type I) - error in Hypothesis test of rejecting $\NullHypothesis$ when $\NullHypothesis$ is true \\
$\beta$: Hypothesis Error Type II - error in Hypothesis test of failing to reject $\NullHypothesis$ when $\NullHypothesis$ is false \\

\section{Mathematical Symbols}
$\forall$: For all \\
$\prod$: product series \\
\begin{equation}
\prod_{i=1}^{3} \Probability(\Event_{i}) \Rightarrow (\Probability(\Event_{1})) (\Probability(\Event_{2})) (\Probability(\Event_{3})) \nonumber
\end{equation}
$\in$: in - an example is $ x \in D $ reads \emph{ values x in set D }\\
$\equiv$: equivalent \\ $\bigl(\begin{smallmatrix} n\\ x \end{smallmatrix} \bigr)$: $n$ ``choose'' $x$ - represents the number of ways of choosing $x$ items from a set of $n$; See Binomial Distribution\\

\section{Concept Definitions}

\subsubsection{Probabilities}

\textbf{\emph{chance experiment}} - any activity or situation in which there is uncertainty about which of two or more possible outcomes will result \\
\textbf{\emph{sample space}} - collection of all possible outcomes of a chance experiment \\
\textbf{\emph{simple event}} - an event with exactly one outcome \\
\textbf{\emph{disjoint or mutually exclusive events}} - two events that have no common outcomes \\

\subsubsection{Random Variables and Probability Distribution}

\textbf{\emph{random variable}} - a numerical variable \emph{X} whose value depends on the outcome of a chance experiment; associates a numerical value with each outcome of a chance experiment \\
\textbf{\emph{discrete random variable}} - random variable \emph{X} where the set of possible values \emph{x} are a collection of isolated points along the number line (ex: pages in a book) \\
\textbf{\emph{continuous random variable}} - random variable \emph{X} where the set of possible values \emph{x} are an entire interval along the number line (ex: life of a lightbulb) \\
\textbf{\emph{probability distribution of a discrete random variable}} - gives the probability associated with \emph{each} possible \emph{x} value of \emph{X}; Denoted as P(X = x) \emph{or} p(x) \\
\textbf{\emph{probability histogram}} - pictorial representation of a discrete probability distribution with axes as \emph{x vs. p(x)}\\
\textbf{\emph{density curve}} - probability distribution for a continuous random variable X; defined by the density function f(x) \\
\textbf{\emph{Bernoulli distribution}} - discrete probability distribution for a single dichotomous observation; a success takes a value of 1 where failure takes a value of 0 \\
\textbf{\emph{binomial distribution}} - discrete probability distribution for the number of successes among a fixed number of independent dichotomous trials \\
\textbf{\emph{geometric distribution}} - discrete probability distribution for the number of independent dichotomous trials up to and including the first success \\
\textbf{\emph{trial}} - when a dichotomous observation (only two possible values may occur) is made \\
\textbf{\emph{standard normal distribution}} - normal distribution with $\Mean = 0$ and $\StandardDev = 1$; often denoted as Z for \emph{z-curve} \\

\subsubsection{Inferential Statistics}

\textbf{\emph{unbiased statistic}} - A statistic whose mean value is equal to the value of the population characteristic being estimated\\
\textbf{\emph{confidence interval}} - an interval, $[a,b]$, for a population characteristic, $\theta$, of plausible values for the characteristic \\
\textbf{\emph{confidence level}} - success rate of the method used to construct the interval, $1 - \alpha$ \\

%%%%%%%%%%%%%%%%%%%
\section{Probabilities}

There are four types of events: (\emph{not A}), (\emph{A or B}), (\emph{A and B}), (\emph{disjoint}).
The mathematical notation:

\begin{eqnarray}
\text{not A} &:&  \overline{A} \nonumber \\
\text{A and B} &:& A \wedge B \nonumber \\
\text{A or B} &:&  A \vee B \nonumber
\end{eqnarray}

If events $A$ and $B$ are disjoint, $\Probability(A \cup B) = \Probability(A) + \Probability(B)$ \\

When all outcomes in the sample space \emph{are equally likely}, the probability of an event can be written as:

\begin{equation}
\Probability(\Event) = \frac{\text{number of outcomes favorable to \emph{\Event}}}{\text{number of outcomes in the sample space \emph{S}}}
\end{equation}

Probability can also be written in terms of the relative frequency of occurrence in a large series of trials. Because of chance, the \textbf{relative frequency} of occurrence for any event will differ from the \textbf{true probability}. The relative frequency will approach $\Probability(\Event)$ as the number of trials increases, because the difference between the relative frequency and the true probability approaches zero (\emph{Law of Large Numbers}).

\begin{equation}
\Probability(\Event) \approx \frac{\text{number of times \emph{\Event} occurs}}{\text{number of trials}}
\end{equation}

\textbf{\emph{Comparing and Combining Sets}}:

\begin{eqnarray}
\Event_{1} \cup \Event_{2} &:& \text{union of sets} - \text{the new set contains \emph{all} elements of both events} \nonumber  \\
\Event_{1} \cap \Event_{2} &:& \text{intersection of sets} - \text{the new set contains only elements appearing in \emph{both} events} \nonumber \\
\Event_{1} - \Event_{2} &:& \text{difference of sets} - \text{the new set contains all elements of first set so long as} \nonumber \\
&& \text{they do not exist in the second} \nonumber
\end{eqnarray}

\textbf{\emph{Basic Properties of Probability}}:

\begin{enumerate}
  \item For any event $\Event$, 0 $\le \Probability(\Event) \le$ 1
  \item If $\Sample$ is the sample space for an experiment, $\Probability(\Sample) = 1$ 
  \item If two events ($\Event_{1}$ and $\Event_{2}$) are \emph{disjoint} then:
  \begin{enumerate}
    \item $\Probability(\Event_{1} \text{ or } \Event_{2})  = \Probability(\Event_{1} \cup \Event_{2}) = \Probability(\Event_{1}) + \Probability(\Event_{2})$
 \end{enumerate}
\item Otherwise, for any two events ($\Event_{1}$ and $\Event_{2}$):
\begin{enumerate}
    \item $\Probability(\Event_{1} \text{ or } \Event_{2})  = \Probability(\Event_{1} \cup \Event_{2}) = \Probability(\Event_{1}) + \Probability(\Event_{2}) - \Probability(\Event_{1} \cap \Event_{2})$
  \end{enumerate}
  \item For any event $\Event, \Probability(\Event) + \Probability(\overline{\Event}) = 1$
  \item $\Probability(\overline{\Event_{1}} \cup \overline{\Event_{2}}) = \Probability(\overline{\Event_{1} \cap \Event_{2}})$
  \item $\Probability(\overline{\Event_{1}} \cap \overline{\Event_{2}}) = \Probability(\overline{\Event_{1} \cup \Event_{2}})$
\end{enumerate}

\textbf{\emph{The Conditional Property}}:
This is the probability of event $\Event_{1}$ given that the event $\Event_{2}$ has occurred. It is denoted with the ``$\mid$'' character. 

\begin{equation}
\Probability(\Event_{1} \mid \Event_{2}) = \frac{\Probability(\Event_{1} \cap \Event_{2})}{\Probability(\Event_{2})}
\label{eqt:conditional}
\end{equation}

\textbf{\emph{The Independent Property}}:
$\Event_{1}$ and $\Event_{2}$ are independent if:

\begin{eqnarray}
\Probability(\Event_{1} \mid \Event_{2}) &=& \Probability(\Event_{1}) \\
\nonumber \\
\text{ and vice versa } && \nonumber \\
\nonumber \\
\Probability(\Event_{2} \mid \Event_{1}) &=& \Probability(\Event_{2}) \nonumber \\
\nonumber \\
\text{ for the case of independence among numerous events} &:& (\Event_{1}, \Event_{2}, ... , \Event_{k}) \nonumber \\
\nonumber \\
\Probability(\Event_{1} \cap \Event_{2} \cap ... \cap \Event_{k}) = \prod_{i=1}^{k} \Probability(\Event_{i})
\end{eqnarray}

\textbf{\emph{The Law of Total Probability}}: Holds for \emph{disjoint} events $B_{1}$, $B_{2}$, ... , $B_{k}$ that together cover the sample space. The \emph{Conditional Property} (Equation \ref{eqt:conditional}) can be used to rewrite $\Probability(\Event \cap B_{i})$ as $\Probability(\Event \mid B_{i}) \Probability(B_{i})$

\begin{equation}
\Probability(\Event) = \sum_{i=1}^{k} \Probability(\Event \cap B_{i})
\end{equation}

\textbf{\emph{Bayes' Rule}}: Holds for the same \emph{disjoint} events $B_{1}$, $B_{2}$, ... , $B_{k}$. It gives the probability of any one of the disjoint events given that the event $\Event$ has occurred; $i$ denotes any value between 1 and $k$ in the disjoint event series.

\begin{equation}
\Probability(B_{i} \mid \Event) = \frac{\Probability(\Event \mid B_{i}) \Probability(B_{i})}{\sum_{j=1}^{k} \Probability(\Event \mid B_{j}) \Probability(B_{j})}
\end{equation}
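
As a quick check of these two formulas, a minimal R sketch (the priors $\Probability(B_{i})$ and likelihoods $\Probability(\Event \mid B_{i})$ below are made-up values):

\begin{lstlisting}
# Disjoint events B1, B2 with made-up priors P(Bi) and likelihoods P(E|Bi)
prior = c(0.3, 0.7)
likelihood = c(0.9, 0.2)

# Law of Total Probability: P(E) = sum over i of P(E|Bi) * P(Bi)
p_E = sum(likelihood * prior)

# Bayes' Rule: posterior P(Bi|E) for every i at once
posterior = likelihood * prior / p_E
\end{lstlisting}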

%%%%%%%%%%%%%%%%%%%
\section{Random Variables and Probability Distribution}

\textbf{\emph{ Properties of Probability Distributions}}:

\begin{enumerate}
 \item Discrete
\begin{enumerate}
  \item For every possible \emph{x} value, $0 \le p(x) \le 1$.
  \item $ \sum_{\text{all x values}} p(x) = 1$
\end{enumerate}
\item Continuous
\begin{enumerate}
  \item $f(x) \ge 0$
\item $\int_{- \infty}^{+ \infty} \! f(x) \, \mathrm{d}x$ = 1
\item The probability of $X$ falling in any interval is simply the area under the curve over that interval
\end{enumerate}
\end{enumerate}

\textbf{\emph{Expected Value of a Random Variable, E(X), and Its Properties}}: \\

\emph{Discrete}

\begin{equation}
\Expected(X) = \sum_{\text{all possible x values}} x \times p(x)
\end{equation}

\emph{Continuous}

\begin{equation}
\Expected(X) = \int_{- \infty}^{+ \infty} \! x \times f(x) \, \mathrm{d}x 
\end{equation}

\emph{General Properties of E(X)}:

\begin{enumerate}
  \item For any real \emph{a} and \emph{b}, $\Expected(aX + b) = a\Expected(X) + b$.
  \item $ \Expected(X_{1} + X_{2}) = \Expected(X_{1}) + \Expected(X_{2})$
\end{enumerate}

\textbf{\emph{Variance, V(X), and Its Properties}}: \\

\emph{Discrete}

\begin{eqnarray}
\StandardDev_{X}^{2} = \Variance(X) &=&  \Expected(X^{2}) - \Expected(X)^{2} \\ \nonumber \\ %\sum_{\text{all possible x values}} [x - E(X)]^{2} \times p(x)
&=& \sum_{i} x_{i}^{2} p(x_{i}) - \left(\sum_{i} x_{i} p(x_{i})\right)^{2}
\end{eqnarray}

\emph{Continuous}

\begin{eqnarray}
\StandardDev_{X}^{2} = \Variance(X) &=&  \Expected(X^{2}) - \Expected(X)^{2} \\ \nonumber \\
&=& \int_{- \infty}^{+ \infty} \! x^{2} \times f(x) \, \mathrm{d}x - \left(\int_{- \infty}^{+ \infty} \! x \times f(x) \, \mathrm{d}x \right )^{2}
\end{eqnarray}

\emph{General Properties of V(X)}:

\begin{enumerate}
  \item For any real \emph{a} and \emph{b}, $\Variance(aX+b) = a^{2}\Variance(X)$.
  \item If $X_{1}$ and $X_{2}$ are independent
	\begin{enumerate}
	\item $\Variance(X_{1} + X_{2}) = \Variance(X_{1}) + \Variance(X_{2})$
	\end{enumerate}
\item If $X_{1}$ and $X_{2}$ are dependent
	\begin{enumerate}
	\item $\Variance(X_{1} + X_{2}) = \Variance(X_{1}) + \Variance(X_{2}) + 2COV(X_{1},X_{2})$
	\item \emph{Where} $COV(X_{1},X_{2}) = \Expected(X_{1}X_{2}) - \Expected(X_{1})\Expected(X_{2})$
	\end{enumerate}
\end{enumerate}

\emph{ Dealing with $\Expected(X_{1}X_{2})$ from Covariance (COV) Equation}

\begin{enumerate}
  \item If $X_{1}$ and $X_{2}$ are \emph{independent}
\begin{enumerate}
\item $\Expected(X_{1}X_{2}) = \Expected(X_{1})\Expected(X_{2})$
\end{enumerate}
\item If $X_{1}$ and $X_{2}$ are \emph{dependent}
\begin{enumerate}
\item Make a probability table
\item Let $X_{1} = X$ and $X_{2} = Y$
\item $\Expected(XY) =  \sum_{\text{all possible x values}}\sum_{\text{all possible y values}} x \times y \times p_{xy}(x,y)$
\end{enumerate}
\end{enumerate}
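
A minimal R sketch of the probability-table method above, using a made-up joint distribution $p_{xy}(x,y)$:

\begin{lstlisting}
# Made-up joint distribution p(x,y): rows index x values, columns y values
x = c(0, 1)
y = c(0, 1, 2)
pxy = matrix(c(0.1, 0.2, 0.1,
               0.2, 0.3, 0.1), nrow=2, byrow=TRUE)

# E(XY) = sum over all (x,y) pairs of x * y * p(x,y)
E_XY = sum(outer(x, y) * pxy)

# Marginal means, then COV(X,Y) = E(XY) - E(X)E(Y)
E_X = sum(x * rowSums(pxy))
E_Y = sum(y * colSums(pxy))
cov_XY = E_XY - E_X * E_Y
\end{lstlisting}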

\textbf{\emph{Correlation Coefficient, $\Correlation$, and Its Properties}}:

\begin{equation}
\Correlation_{X_{1}X_{2}} = \frac{COV(X_{1},X_{2})}{\StandardDev_{X_{1}}\StandardDev_{X_{2}}} \in [-1, +1]
\end{equation}

\begin{enumerate}
  \item The closer $\Correlation_{X_{1}X_{2}}$ is to $+1$, the stronger the positive linear relationship
  \item The closer $\Correlation_{X_{1}X_{2}}$ is to $-1$, the stronger the negative linear relationship
  \item If $\Correlation_{X_{1}X_{2}} = 0$, there is an absence of linear dependence
\end{enumerate}

\textbf{\emph{Bernoulli Distribution}}: \\

\emph{Properties of a Bernoulli Distribution}

\begin{enumerate}
\item Each trial is a Bernoulli variable.
\begin{enumerate}
\item a trial can result in only \emph{one of two outcomes:} 1. Success ($S$) 2. Failure ($F$).
\end{enumerate}
\end{enumerate}

\begin{equation}
\Expected(X) = p
\end{equation}

\begin{equation}
\Variance(X) = p(1-p) = pq
\end{equation}

\textbf{\emph{Binomial Distribution}}: \\

\emph{Properties of a Binomial Distribution}

\begin{enumerate}
  \item \textbf{There are a fixed number of trials, $n$.}
  \item Each trial is a Bernoulli variable - only two possible results.
  \item Outcomes of different trials are \emph{independent}.
\item The probability, $p$, that a trial results in $S$ is the same for each trial.
\item The binomial random variable, $X$, is the \emph{number of successes observed among the $n$ trials}
\begin{enumerate}
\item $X \equiv B(n,p)$
\end{enumerate}
\end{enumerate}

\emph{Binomial Probability, p(x)}:

\begin{eqnarray}
\Probability(X=x) = p(x) &=& P(\text{x successes among n trials}) \nonumber \\ \nonumber \\
&=& \begin{pmatrix}
  n \\
  x \\
 \end{pmatrix}
p^{x} (1 - p)^{n-x} \nonumber \\ \nonumber \\
&=& \left(\frac{n!}{x!(n-x)!} \right )p^{x} (1 - p)^{n-x}
\end{eqnarray}

\emph{Binomial Expected Value, E(X)}: \\

\begin{equation}
\Expected(X) = np
\end{equation}

\emph{Binomial Variance, V(X)}: \\

\begin{equation}
\Variance(X) = np(1-p) = npq
\end{equation}
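
These formulas can be checked in R, which provides \texttt{dbinom} for the binomial probability function (the values of $n$ and $p$ below are arbitrary):

\begin{lstlisting}
n = 10; p = 0.3
x = 0:n

# P(X = x) from the closed form and from the built-in dbinom
p_manual = choose(n, x) * p^x * (1 - p)^(n - x)
p_builtin = dbinom(x, n, p)

# E(X) = np = 3 and V(X) = np(1-p) = 2.1, recovered from the distribution
sum(x * p_builtin)
sum(x^2 * p_builtin) - sum(x * p_builtin)^2
\end{lstlisting}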

\textbf{\emph{Geometric Distribution}}: \\

\emph{Properties of a Geometric Distribution}

\begin{enumerate}
  \item The trials are \emph{independent}
\item Each trial is a Bernoulli variable - only two possible results.
\item The probability, $p$, of success is the same for all trials.
\end{enumerate} 

\emph{Geometric Probability, p(x)}: 

\begin{equation}
\Probability(X=x) = p(x) = p(1 - p)^{x-1}
\end{equation}

\emph{Geometric Expected Value, E(X)}: \\

\begin{equation}
\Expected(X) = \frac{1}{p}
\end{equation}

\emph{Geometric Variance, V(X)}: \\

\begin{equation}
\Variance(X) = \frac{1-p}{p^{2}} = \frac{q}{p^{2}}
\end{equation}
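
The same check works for the geometric distribution, with one caveat: R's \texttt{dgeom} counts the failures \emph{before} the first success, so the $p(x)$ above (with $x$ the trial number of the first success) corresponds to \texttt{dgeom(x - 1, p)}. The value of $p$ is arbitrary:

\begin{lstlisting}
p = 0.25
x = 1:50   # x = trial number of the first success (truncated for the sums)

# p(x) = p(1-p)^(x-1) from the formula above, and via the built-in
p_manual = p * (1 - p)^(x - 1)
p_builtin = dgeom(x - 1, p)

# E(X) = 1/p = 4, approximately (the sum is truncated at x = 50)
sum(x * p_manual)
\end{lstlisting}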

\textbf{\emph{Standard Normal Distribution (Z Curve)}}: \\

Used when you want to find the percent of the population that meets a specified condition and $X$ is approximately normally distributed, mathematically stated as $ X \equiv N(\Mean, \StandardDev)$. Standardizing $X$ gives $Z$, which follows the standard normal distribution with $\Mean = 0$ and $\StandardDev = 1$.

\begin{equation}
Z = \frac{X-\Mean}{\StandardDev} \nonumber
\end{equation}

\emph{Using the Standard Normal (z) Table}

\begin{enumerate}
\item The table is used when a probability is needed for a specified range of $X$
\item Negative z-score: value is to the left of the mean
\item Positive z-score: value is to the right of the mean
\item Side of the table gives the ones and tenths digits of the $z^{*}$ value
\item Top of the table gives the second decimal place of the $z^{*}$ value
\item The number at the appropriate intersection of this row and column is $P( Z < z^{*} )$.
\item It is the cumulative probability, written in decimal form.
\item It is possible to approximate a discrete distribution by a normal curve, and thus the normal distribution can be used for approximate probabilities.
\end{enumerate}
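
In R the table lookup can be replaced by \texttt{pnorm}, which returns $P(Z < z^{*})$ directly (the example numbers are arbitrary):

\begin{lstlisting}
# P(Z < z*) without the table
pnorm(1.96)                     # about 0.975

# For a normal X, standardize first or let pnorm do it; e.g. X ~ N(100, 15)
mu = 100; sigma = 15
pnorm((130 - mu) / sigma)       # P(X < 130) via the z-score
pnorm(130, mean=mu, sd=sigma)   # same result
\end{lstlisting}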

\textbf{\emph{Central Limit Theorem}}: For sufficiently large samples, the sample average $\bar{X}$ is approximately normally distributed, whatever the distribution of the underlying population. It is often used to simplify problems that would otherwise be solved with the binomial distribution; this is the case where $X = \bar{X}$.\\

\emph{Central Limit Theorem Conditions}

\begin{enumerate}
\item Sample size, $n$, is large: $ n\ge 30$
\item Random Samples are a sequence where each sample is independent and identically distributed (i.i.d.)
\end{enumerate}

\begin{eqnarray}
\text{When} & \bar{X} = \frac{X_{1} + X_{2} + ... + X_{n}}{n} & \text{where } n \ge 30 \nonumber \\
\text{And} & X_{1}, X_{2}, ... , X_{n} & \text{are i.i.d.} \nonumber \\
\text{Then} & \Mean_{\bar{X}} = \Mean & \\
\text{And} & \StandardDev_{\bar{X}} = \frac{\StandardDev}{\sqrt{n}} & \text{(for a proportion, } \StandardDev_{\bar{X}} = \sqrt{\tfrac{p(1-p)}{n}} = \sqrt{\tfrac{pq}{n}} \text{)}
\end{eqnarray}
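
The theorem is easy to see by simulation; this R sketch draws samples of size $n = 30$ from a deliberately skewed (exponential) population:

\begin{lstlisting}
# Means of 10000 samples of size n = 30 from an exponential population
n = 30
xbar = replicate(10000, mean(rexp(n, rate=1)))

mean(xbar)   # close to the population mean, mu = 1
sd(xbar)     # close to sigma / sqrt(n) = 1 / sqrt(30)
hist(xbar)   # roughly bell-shaped despite the skewed population
\end{lstlisting}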

%%%%%%%%%%%%%%%%%%%
\section{Inferential Statistics}

\emph{Inferential Statistics Objective:} Use sample data to decrease uncertainty about some characteristic of the corresponding population. Such characteristics include: population mean, $\Mean$, or population proportion, $\PopProportion$. \\

\textbf{\emph{Unbiased Statistic}}: When a sample of the complete data set can be used to determine the characteristics of the larger data set.\\

A statistic $\hat{\theta}$ is an unbiased estimate of the population characteristic $\theta$ iff: $\Expected(\hat{\theta}) = \theta$. \\

Let $\hat{p}$ be an unbiased statistic of $p$. And let $\hat{p}$ be the proportion of successes in a random sample size, $n$ from a population whose proportion of success is $p$.

\begin{equation}
\hat{p} =\frac{\text{number of successes in the sample}}{\text{size of the sample}} = \frac{X}{n}
\end{equation}

\begin{equation}
\StandardDev_{\hat{p}} = \sqrt{\frac{p(1-p)}{n}}
\end{equation}

\textbf{\emph{Maximum Likelihood Estimation}}: Used when the value of a parameter is unknown; it provides an estimate of the parameter given the observed data. Let $f(x \mid \theta)$ be a density function with some parameter $\theta$.

The likelihood of the sample is the joint density function for all observations, $f(x_{1}, x_{2}, ... , x_{n} \mid \theta)$:

\begin{eqnarray}
f(x_{1}, x_{2}, ... , x_{n} \mid \theta) &=& f(x_{1} \mid \theta) \times f(x_{2} \mid \theta) \times ... \times f(x_{3} \mid \theta) \nonumber \\ \nonumber \\
 &=& \prod_{i=1}^{n} f(x_{i} \mid \theta) \nonumber
\end{eqnarray}

Assuming the sample of size $n$ is random and i.i.d., setting the derivative of the likelihood with respect to $\theta$ equal to 0 and solving will yield the estimate.

\begin{equation}
\frac{df(x_{1}, x_{2}, ... , x_{n} \mid \theta)}{d\theta} = 0
\end{equation}
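
When the derivative has no convenient closed form, the likelihood (or its log) can be maximized numerically. A sketch for a Bernoulli parameter $p$ with a made-up 0/1 sample; the numerical maximum agrees with the known closed form $\hat{p} = \bar{x}$:

\begin{lstlisting}
# Made-up i.i.d. Bernoulli sample
x = c(1, 0, 1, 1, 0, 1, 1, 0, 1, 1)

# Log-likelihood: log of the product of f(xi | p)
loglik = function(p) sum(x * log(p) + (1 - x) * log(1 - p))

# Maximize over 0 < p < 1; compare with the closed-form MLE, mean(x)
optimize(loglik, interval=c(0.001, 0.999), maximum=TRUE)$maximum
mean(x)
\end{lstlisting}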

\textbf{\emph{Confidence Interval (CI)}}: \\

Let the confidence level be $1 - \alpha$; it is expressed as a percentage.

\begin{equation}
\forall \alpha > 0, P( a \le \theta \le b) = 1 - \alpha
\end{equation}

Can use the standard normal (z) curve areas for the interval. $z^{*} \Rightarrow z_{\alpha/2}$ \\
$\alpha$ is area outside the range and under the curve, divided by 2. \\

\emph{CI methodology to create the equations}

\begin{enumerate}
\item Rewrite the $\theta$ term as $Z$ and adjust the bounds accordingly (subtract $\Expected(\hat{\theta})$, divide by $\StandardDev_{\hat{\theta}}$)
\item Set bounds equal to $ \pm z^{*}$ term as determined by the range.
\item Solve for $a$ and $b$ in terms of $z^{*}$.
\item Can substitute new $a$ and $b$ terms in original function.
\item Confidence interval allows $z^{*} \Rightarrow z_{\alpha/2}$.
\item If the sample size is large, the population parameter can be replaced by its unbiased estimate. (ex: $p \Rightarrow \hat{p}$)
\item Determine the $z_{\alpha/2}$ score using the table. (ex: if $1 - \alpha = 95\%; \alpha/2 = 0.05/2 = 0.025; z_{\alpha/2} = z_{0.025} = 1.96$)
\item Make substitutions for $z_{\alpha/2}$ and $\theta$ and solve.
\end{enumerate}

For a given sample size, $n$, there is a trade-off between the confidence level $1 - \alpha$ and the width of the interval $[a,b]$ around $\theta$. \\

\emph{Substitution Cases for Confidence Interval}: When putting in terms of the Z value.

\begin{enumerate}
\item Proportion, $\hat{p}$
\begin{enumerate}
\item $\Expected(\hat{p}) = p$
\item $\StandardDev_{\hat{p}} = \sqrt{\frac{p(1-p)}{n}}$
\item $P \left( \hat{p} - z_{\alpha/2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}} \le p \le \hat{p} + z_{\alpha/2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}} \right ) = 1 - \alpha$
\item Sample size: $n = p(1-p)(\frac{z_{\alpha/2}}{A})^{2}$
\end{enumerate}
\item Mean, $\bar{X}$
\begin{enumerate}
\item $\Expected(\bar{X}) = \Mean$
\item $\StandardDev_{\bar{X}} = \frac{\StandardDev}{\sqrt{n}}$
\item $P\left ( \bar{X} -  z_{\alpha/2} \frac{\hat{\StandardDev}}{\sqrt{n}} \le \Mean \le \bar{X} +  z_{\alpha/2} \frac{\hat{\StandardDev}}{\sqrt{n}} \right ) = 1 - \alpha$
\item Sample size: $n = (\StandardDev \times \frac{z_{\alpha/2}}{A})^{2}$
\end{enumerate}
\end{enumerate}
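
A minimal R sketch of case 1, the large-sample interval for a proportion (the counts are made up):

\begin{lstlisting}
# 95% CI for a population proportion: made-up 54 successes in 120 trials
x = 54; n = 120
p_hat = x / n
alpha = 0.05
z = qnorm(1 - alpha / 2)   # z_{alpha/2}, about 1.96

se = sqrt(p_hat * (1 - p_hat) / n)
c(p_hat - z * se, p_hat + z * se)   # the interval [a, b]
\end{lstlisting}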

%%%%%%%%%%%%%%%%%%%
\section{Hypotheses and Test Procedure}

Testing whether some claim or hypothesis about a population characteristic is plausible. \\

\emph{Possible Conclusions}:

\begin{enumerate}
\item Reject $\NullHypothesis$, because sample evidence, $\hat{\theta}$, strongly suggests it is false.
\item Fail to reject $\NullHypothesis$, because the sample evidence cannot disprove it.
\end{enumerate}

\emph{Forms of the Alternative Hypothesis ($\AltHypothesis$)}: The null hypothesis, $\NullHypothesis$, always states $\theta = \HypothesisVal$.

\begin{enumerate}
\item $\AltHypothesis$: $\theta > \HypothesisVal$ (upper-tailed test)
\item $\AltHypothesis$: $\theta < \HypothesisVal$ (lower-tailed test)
\item $\AltHypothesis$: $\theta \ne \HypothesisVal$ (two-tailed test)
\end{enumerate}

\emph{Errors in Hypothesis Test}: Risk of error is the consequence of basing the decision on a sample. The only case in which neither error occurs is when $n \rightarrow \infty $.

\begin{enumerate}
\item $\alpha = \Probability(\NullHypothesis \text{ is rejected} \mid \NullHypothesis \text{ is true})$
\item $\beta = \Probability(\NullHypothesis \text{ is not rejected} \mid \NullHypothesis \text{ is false})$
\end{enumerate}

\emph{Hypothesis Test Procedure}: Calculating $\alpha$ (because $\beta$ cannot be calculated). Both cases satisfy the Central Limit Theorem ($n$ is large). If the hypothesis test condition [$>$, $<$, or $\ne$] holds for the value $k$, $\NullHypothesis$ can be rejected with an error $\alpha$.

\begin{enumerate}
\item Hypothesis test for a population Proportion
\begin{enumerate}
\item Setup: $\alpha = \Probability( \hat{p} [>, <, \ne] k \mid \NullHypothesis \text{ is true.})$
\begin{enumerate}
\item $k = p_{0} + z_{1-\alpha} \sqrt{\frac{p_{0} q_{0}}{n}}$
\end{enumerate}
\item Setup: $\NullHypothesis$: $p_{1} = p_{2}$ versus $\AltHypothesis$: $p_{1} [>, <, \ne] p_{2}$
\begin{enumerate}
\item $k =z_{1-\alpha} \sqrt{\frac{p_{1} q_{1}}{n_{1}} + \frac{p_{2} q_{2}}{n_{2}}}$
\end{enumerate}
\end{enumerate}
\item Hypothesis test for a population Mean
\begin{enumerate}
\item Setup: $\alpha = \Probability( \bar{X}  [>, <, \ne]  k \mid \NullHypothesis \text{ is true.})$
\begin{enumerate}
\item $k = \Mean_{0} + z_{1-\alpha} \left (\frac{\hat{\StandardDev}}{\sqrt{n}} \right )$
\end{enumerate}
\item Setup: $\NullHypothesis$: $\Mean_{1} = \Mean_{2}$ versus $\AltHypothesis$: $\Mean_{1} [>, <, \ne] \Mean_{2}$
\begin{enumerate}
\item $k =z_{1-\alpha} \sqrt{\frac{\hat{\StandardDev}_{1}^{2}}{n_{1}} + \frac{\hat{\StandardDev}_{2}^{2}}{n_{2}}}$
\end{enumerate}
\end{enumerate}
\end{enumerate}
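
A sketch of case 1(a), the upper-tailed test for a proportion, with made-up numbers:

\begin{lstlisting}
# H0: p = p0 versus Ha: p > p0, at significance level alpha
p0 = 0.5; n = 100; alpha = 0.05
k = p0 + qnorm(1 - alpha) * sqrt(p0 * (1 - p0) / n)

# Reject H0 (with Type I error alpha) if the observed p_hat exceeds k
p_hat = 0.59
p_hat > k
\end{lstlisting}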

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Linear Algebra}

% Each chapter should get a variable definitions section
\section{Variable Definitions}

$\lambda$: eigenvalue - factor by which the eigenvector is scaled when multiplied by the matrix \\

\section{Concept Definitions}

\subsubsection{Linear Algebra}

\textbf{\emph{eigenvectors}} -  for the case of a square matrix, non-zero vectors that, after being multiplied by the square matrix, remain parallel to the original vector \\

%Put definitions of all variables used here, in the following format:
% <variable command>: <name in words> - Definition

% Each topic gets a section in the document
\section{Linear Algebra Basics}

\subsection{Matrix Multiplication}

Let $A$ be an $m \times n$ matrix and $B$ be an $n \times p$ matrix. $C$ is the resultant matrix that is $m \times p$ and $r$ is a scalar. $I_{m}$ is an $m \times m$ identity matrix. \\

\emph{Properties of Matrix Multiplication}

\begin{enumerate}
\item $A(BC) = (AB)C$
\item $A(B+C) = AB + AC$
\item $(B + C)A = BA + CA$
\item $r(AB) = (rA)B = A(rB)$
\item $I_{m}A = A = AI_{n}$
\end{enumerate}

\emph{Properties of Matrix Transpose}

\begin{enumerate}
\item $(A^T)^T = A$
\item $(A + B)^T = A^T + B^T$
\item $(rA)^T = rA^T$
\item $(AB)^T = B^{T}A^{T}$
\end{enumerate}
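
These properties are easy to spot-check in R, where \texttt{\%*\%} is matrix multiplication and \texttt{t()} is the transpose:

\begin{lstlisting}
A = matrix(1:6, nrow=2)    # a 2 x 3 matrix
B = matrix(1:12, nrow=3)   # a 3 x 4 matrix

A %*% B          # matrix multiplication, giving a 2 x 4 matrix
t(A %*% B)       # equals t(B) %*% t(A)
t(B) %*% t(A)

diag(2) %*% A    # I_m A = A, with diag(m) as the m x m identity
\end{lstlisting}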

\subsection{Determinant}

The determinant provides important information about a matrix of coefficients of a system of linear equations, or about a matrix that corresponds to a linear transformation of a vector space. \\

\emph{The $2 \times 2$ Matrix}

\begin{eqnarray}
\text{If: } A &=& 
\begin{pmatrix}
  x & x' \\
  y & y' \\
 \end{pmatrix} \nonumber \\
det(A) &=& xy' - yx'
\end{eqnarray}

\emph{The $3 \times 3$ Matrix}: Where $\mid \mid$ denotes the determinant of the matrix.

\begin{eqnarray}
\text{If: } A &=& 
\begin{pmatrix}
  a_{11} & a_{12} & a_{13} \\
  a_{21} & a_{22} & a_{23} \\
  a_{31} & a_{32} & a_{33} \\
 \end{pmatrix} \nonumber \\ \nonumber \\
det(A) &=&  a_{11} 
\begin{vmatrix}
 a_{22} & a_{23} \\
 a_{32} & a_{33} \\
 \end{vmatrix}
-  a_{12}
\begin{vmatrix}
 a_{21} & a_{23} \\
 a_{31} & a_{33} \\
 \end{vmatrix}
+  a_{13}
\begin{vmatrix}
 a_{21} & a_{22} \\
 a_{31} & a_{32} \\
 \end{vmatrix}
 \\ \nonumber \\
det(A) &=&  a_{11}(a_{22}a_{33} - a_{23}a_{32}) - a_{12}(a_{21}a_{33} - a_{23}a_{31}) + a_{13}(a_{21}a_{32} - a_{22}a_{31}) \nonumber 
\end{eqnarray}

\emph{The $n \times n$ Matrix where $n > 2$}: $A_{1j}$ denotes the minor of $A$, with row $1$ and column $j$ removed.

\begin{equation}
det(A) = \sum_{j=1}^n (-1)^{1+j} a_{1j} det(A_{1j})
\end{equation}

\emph{Properties of Determinants}

\begin{enumerate}
\item If $A$ is a triangular matrix, then $det(A)$ is the product of the entries of the main diagonal of $A$.
\item If $A$ is an $n \times n$ matrix, then $det(A^T) = det(A)$
\item If $A$ and $B$ are $n \times n$ matrices, then $det(AB) = det(A) \times det(B)$
\item If $A$ is an $n \times n$ matrix, then $det(rA) = r^{n}det(A)$
\end{enumerate}

\subsection{Matrix Inverse}
The inverse of a matrix is denoted by raising the matrix to the power $-1$. \\ \\
\emph{Condition for Matrix Invertibility}: If $det(A) \ne 0$, then the matrix is invertible (non-singular). Therefore, the determinant of a matrix can prove whether or not it is invertible. \\

\emph{Properties of Matrix Inverse}

\begin{enumerate}
\item $A^{-1} A = AA^{-1} = I_{n}$
\item $(A^{-1})^{-1} = A$
\item $(AB)^{-1} = B^{-1} A^{-1}$
\item $(A^{T})^{-1} = (A^{-1})^{T}$
\end{enumerate}

\emph{The $2 \times 2$ Matrix}

\begin{eqnarray}
\text{If: } A &=& 
\begin{pmatrix}
  a & b \\
  c & d \\
 \end{pmatrix} \nonumber \\
A^{-1} &=& \frac{1}{det(A)}
\begin{pmatrix}
  d & -b \\
  -c & a \\
 \end{pmatrix}
\end{eqnarray}

\emph{Matrix Inverse Closed Form}: when $b$ is the identity matrix, the solution $X$ is the inverse of $A$.

\begin{eqnarray}
\text{If: } & A^{-1} \text{ exists} \nonumber \\
\text{For: } & AX = b \nonumber \\
\text{Solution: } & X = A^{-1}b \nonumber \\
\text{When } b = I_{n}: & AX = b \Rightarrow AA^{-1} = I_{n}
\end{eqnarray}

\emph{Finding $A^{-1}$}

\begin{enumerate}
\item $det(A) \ne 0$
\item Place $A$ and $I$ side-by-side to form an augmented matrix: $[ A \mid I ]$
\item Transform A into I using elementary row operations.
\item Where $I$ was, now $A^{-1}$ is found.
\end{enumerate}
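
In R, \texttt{solve()} performs both jobs: with one argument it inverts a matrix, with two it solves $AX = b$ directly (the matrix below is made up):

\begin{lstlisting}
A = matrix(c(2, 1, 1, 3), nrow=2)

det(A)            # nonzero, so A is invertible
Ainv = solve(A)   # matrix inverse
Ainv %*% A        # recovers the 2 x 2 identity (up to rounding)

# Solving AX = b directly, without forming the inverse
b = c(1, 2)
solve(A, b)
\end{lstlisting}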

\emph{Invertible Matrix Theorem}: Let $A$ be a square $n \times n$ matrix. The following statements are then equivalent:

\begin{enumerate}
\item $A$ is an invertible matrix.
\item $A$ is row equivalent to the $n \times n$ identity matrix.
\item $A$ has $n$ pivot positions.
\item The columns of $A$ form a linearly independent set.
\item The columns of $A$ span $\Re^{n}$.
\item $A^T$ is an invertible matrix.
\end{enumerate}

\subsection{Eigenvectors and Eigenvalues}

\emph{Linear Transformation}: a non-zero vector $x$ is an eigenvector of $A$, with eigenvalue $\lambda$, when $Ax = \lambda x$.
If the eigenvectors, $x$, are found, the data can be projected into a new space.
Sometimes new information can be found in a new space. \\

Special case: $Ix = x$, where $x$ is any non-zero vector. \\

\emph{Finding the Eigenvalues, $\lambda$ of a Matrix, $A$}: $\lambda$ is only an eigenvalue if $(A - \lambda I)$ is \emph{not} invertible. 

Solve for $\lambda$

\begin{equation}
det(A - \lambda I) = 0
\end{equation}

The roots $\lambda$ of this characteristic polynomial are the eigenvalues.\\
Note: the product of the eigenvalues equals the determinant of the matrix. \\
Substitute each eigenvalue back into $Ax = \lambda x$ to find the corresponding eigenvectors.
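
In R, \texttt{eigen()} returns both at once; the check at the end confirms $Ax = \lambda x$ for the first eigenpair (the matrix is made up):

\begin{lstlisting}
A = matrix(c(2, 1, 1, 2), nrow=2)

e = eigen(A)
e$values    # eigenvalues (3 and 1 here)
e$vectors   # columns are the corresponding eigenvectors

# Check that Ax - lambda x is (numerically) the zero vector
A %*% e$vectors[, 1] - e$values[1] * e$vectors[, 1]
\end{lstlisting}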

\subsection{Trace}

The trace of a matrix is the sum of the eigenvalues. \\
If you find the eigenvalues you can find the trace. \\

Another way to determine the trace is to simply sum the elements on the main diagonal. \\

\begin{equation}
tr(A) = a_{11} + ... + a_{nn} = \sum_{i=1}^n a_{ii}
\end{equation}

\emph{Properties of Matrix Trace}

\begin{enumerate}
\item $tr( A + B) = tr(A) + tr(B)$
\item $tr(rA) = r \times tr(A)$
\item $tr(A) = tr(A^T)$
\item $tr(AB) = tr(BA)$
\item $tr(A^TB) = tr(AB^T) = tr(B^TA) = tr(BA^T)$
\item $tr(ABC) = tr(CAB) = tr(BCA)$
\end{enumerate}
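
Both routes to the trace, in R (the matrix is made up):

\begin{lstlisting}
A = matrix(c(2, 1, 0, 3), nrow=2)

# Trace as the sum of the main diagonal...
sum(diag(A))

# ...and as the sum of the eigenvalues
sum(eigen(A)$values)
\end{lstlisting}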

\section{Convex Optimization}

\subsection{Convex Optimization Basics}

A convex optimization problem is a mathematical optimization problem in which both the objective function, $f_{0}$, and the constraint functions, $f_{i}$, are convex. The smallest value of $f_{0}$ among all vectors that satisfy the constraints is the optimal solution, $x^{*}$. \\

A function, $f_{i}$, is convex if:

\begin{eqnarray}
& f_{i}(\alpha x + \beta y) \le \alpha f_{i}(x) + \beta f_{i}(y) \\ \nonumber \\
\text{Where: }& \forall x, y \in \Re^{n} \nonumber \\
\text{And: }& \forall \alpha, \beta \in \Re \nonumber \\
\text{With: }& \alpha + \beta = 1 \nonumber \\
\text{And: }& \alpha \ge 0 \nonumber \\
\text{And: }& \beta \ge 0 \nonumber
\end{eqnarray}

\subsection{Lagrange Multipliers}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Linear Regression}

% Each chapter should get a variable definitions section
\section{Variable Definitions}

%Put definitions of all variables used here, in the following format:
% <variable command>: <name in words> - Definition

% Each topic gets a section in the document
\section{Notation}

\begin{enumerate}
\item $m$: number of training examples
\item $ x \in \Re^{n}$: input feature vector of n variables
\item $y$: output variable/target
\item $(x,y)$: training example
\item $(x^{(i)},y^{(i)})$: $i^{th}$ training example
\item $h$: hypothesis that maps from input $x$ to output $y$
\item $h(x) = \theta_{0} + \theta_{1}x_{1} + ... + \theta_{n}x_{n}$
\item When $x_{0} = 1$ s.t.
\begin{enumerate}
\item $h(x) = h_{\theta}(x) = \sum_{i=0}^n \theta_{i}x_{i} = \theta^Tx$
\end{enumerate}
\item $\theta = \theta_{0}, \theta_{1},..., \theta_{n}$: parameters of the linear regression
\end{enumerate}

\section{Objective Function}

\begin{equation}
\min_{\theta} \frac{1}{2} \sum_{i=1}^m \left ( h_\theta (x^{(i)}) - y^{(i)} \right )^2 = \min_{\theta}J(\theta)
\end{equation}

\section{Batch Gradient Descent}
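
Batch gradient descent minimizes $J(\theta)$ by repeatedly stepping against the gradient, computed from \emph{all} $m$ training examples per step. A minimal R sketch, assuming a fixed (made-up) learning rate \texttt{alpha} and a design matrix \texttt{X} whose first column is all 1s (the $x_{0} = 1$ convention above):

\begin{lstlisting}
# Batch gradient descent for linear regression
# X: m x (n+1) design matrix with a leading column of 1s; y: targets
gradient_descent = function(X, y, alpha=0.01, iters=1000){
    theta = rep(0, ncol(X))
    for (i in 1:iters){
        grad = t(X) %*% (X %*% theta - y)       # gradient of J over ALL m examples
        theta = theta - alpha * as.vector(grad) # step against the gradient
    }
    return(theta)
}
\end{lstlisting}

If \texttt{alpha} is too large the iteration diverges; too small and convergence is slow.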

\section{Stochastic Gradient Descent}
When the number of training examples, $m$, is large, Stochastic Gradient Descent is used.
Not all examples are used for each update, so computation is faster. \\

\emph{SGD Methodology}

\begin{enumerate}
\item Randomly choose a single training example.
\item Update $\theta$ using the gradient computed from that example alone.
\item Repeat until convergence.
\end{enumerate}

\section{Closed Form Solution}

\begin{equation}
y = 
\begin{bmatrix}
  y^{(1)} \\
 \vdots \\
   y^{(m)} \\
 \end{bmatrix}
\end{equation}

\begin{equation}
X = 
\begin{bmatrix}
  (x^{(1)})^T \\
 \vdots \\
  (x^{(m)})^T  \\
 \end{bmatrix}
\end{equation}

\begin{equation}
\theta = (X^TX)^{-1}X^Ty
\end{equation}

Note: $(X^TX)$ \emph{must be invertible} (i.e., $(X^TX)^{-1}$ exists). If not, perform PCA.
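
The closed form translates directly into R (with \texttt{X} and \texttt{y} as defined above):

\begin{lstlisting}
# theta = (X^T X)^{-1} X^T y, assuming t(X) %*% X is invertible
theta = solve(t(X) %*% X) %*% (t(X) %*% y)
\end{lstlisting}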

\section{Lasso: A Regularized Version of Linear Regression}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{PCA}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Clustering}

\section{Introduction}

Clustering is an attempt to sort data into distinct groups based on the parameters of each data point.

Each point can be described by a vector, where each dimension in the vector is one feature of the data point.

Every clustering algorithm tries to use different metrics of ``distance'' between different points in this multidimensional space to predict clusters.
Each algorithm approaches the problem differently.

\subsection{Feature Preprocessing}
For most clustering algorithms to work, preprocessing is necessary.
Outliers should be removed, excess dimensions eliminated, missing data dealt with, etc.

Sometimes (such as in $k$-means), each feature needs to be separately processed across all samples. 
This is often done by normalizing each feature to zero mean and unity standard deviation.
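
In R this normalization is a one-liner via the built-in \texttt{scale()} (assuming \texttt{data} is a numeric matrix or data frame):

\begin{lstlisting}
# Normalize every feature (column) to zero mean and unit standard deviation
data_scaled = scale(data)

# Check: each column mean is ~0 and each column sd is 1
colMeans(data_scaled)
apply(data_scaled, 2, sd)
\end{lstlisting}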

Two types of clustering exist: \emph{hard clustering} and \emph{fuzzy clustering}.
In this course, we only considered hard clustering.
Hard clustering is when each data point belongs to exactly one cluster.
On the other hand, fuzzy clustering is when a point has a nonzero probability of belonging to {\it every} cluster.

\section{Proximity}

\subsection{Similarity Measures}

A similarity measure is a mathematical metric that determines how similar two data points are. It is a function $s(x,y)$, where $x$ and $y$ are two data points. Some considerations follow:

$s(x,y)$ must be maximal when $x=y$. The value when this is true is $s(x,y) = s_{0}$.

$s(x,y)$ may {\it only} equal $s_{0}$ when $x=y$.

The slides give different possible functions $s(x,y)$.

\subsection{Dissimilarity Measures}

Dissimilarity measures, like similarity measures, provide a metric for the proximity of points. However, they are minimal for identical points, not maximal. A more formal description follows.

A dissimilarity function is expressed as $d(x,y)$, where $x$ and $y$ are two data points.

$d(x,y)$ must be minimal when $x=y$. The value when this is true is $d(x,y) = d_{0}$.

$d(x,y)$ may {\it only} equal $d_{0}$ when $x = y$.

The slides give different possible functions $d(x,y)$.

\subsection{Proximity of Clusters}

Sometimes, we want to measure the proximity of two clusters, or of a point to a cluster.

In this case, we use the same metrics, but consider a representative point for the cluster.

This point can be calculated (mean, etc), or it can be a representative (min distance, max distance, etc).

\subsection{Sequential Clustering: BSAS}

For this algorithm, a dissimilarity metric is needed.
The metric must measure the dissimilarity between a point and a cluster.
Generally, the representative point used for the cluster is the point that provides the {\it minimum} value for $d(x,C)$.

\begin{enumerate}
    \item Inputs: dissimilarity function $d(x,C)$, distance threshold $\Theta$
    \item Initialize an initial cluster with one element (choice does not matter)
    \item For every other sample point $x_{i}$:
        \begin{enumerate}
            \item find the cluster $C_{k}$ that minimizes $d(x_{i}, C)$
            \item if $d(x_{i}, C_{k}) \le \Theta$
                \begin{enumerate}
                    \item TRUE: add $x_{i}$ to $C_{k}$
                    \item FALSE: add $x_{i}$ to new cluster
                \end{enumerate}
        \end{enumerate}
    \item End Algorithm
\end{enumerate}
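
A minimal R sketch of BSAS under these choices, assuming Euclidean distance and taking $d(x,C)$ as the minimum distance to any member of $C$ (both are assumptions; any dissimilarity works):

\begin{lstlisting}
# X: n x d matrix of sample points; theta: distance threshold
# Returns a cluster label for every point
bsas = function(X, theta){
    labels = rep(0, nrow(X))
    labels[1] = 1    # the first point starts cluster 1
    k = 1
    for (i in 2:nrow(X)){
        # d(x_i, C) = minimum Euclidean distance to a cluster member
        d = sapply(1:k, function(c){
            members = X[labels == c, , drop=FALSE]
            min(sqrt(rowSums(sweep(members, 2, X[i, ])^2)))
        })
        if (min(d) <= theta){
            labels[i] = which.min(d)   # add x_i to the closest cluster
        } else {
            k = k + 1                  # start a new cluster
            labels[i] = k
        }
    }
    return(labels)
}
\end{lstlisting}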

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%        
%\chapter{Organization of Lab Files}
%
%\section{Lab 01}
%
%\section{Lab 02}
%
%\subsection{Control Structures}
%
%\begin{lstlisting}
%if (condition){
%
%} else {
%
%}
%\end{lstlisting}
%
%\begin{lstlisting}
%for (var in list){
%    # Execute code here
%}
%
%for (i in 1:25){
%    # Execute code here
%}
%
%for (i in seq(from=5, to=25, by=5)){
%    # Execute code here
%}
%\end{lstlisting}
%
%\subsection{Plots and Graphics}
%
%\begin{lstlisting}
%# Typical line plot
%plot(<x_coords>, <y_coords>, type="l", main="<title>", xlab="<x label>", ylab="<y label>", asp=<y/x ratio>, xlim=c(<xmin>,<xmax>), ylim=c(<ymin>,<ymax>))
%
%# Generate pairwise graphs of many different variables
%pairs(name_of_dataframe[column_range_considered])
%
%# Bar plot:
%barplot(<bar_heights>, names.arg=c(<name_1>, <name_2>, ...), main="<title", xlab="<x label>", ylab="<y label>",  ylim=c(<ymin>, <ymax>))
%
%# Adding a line:
%segments(xmin, ymin, xmax, ymax, col="<colorname>")
%\end{lstlisting}
%
%Other plot types are mentioned here too.
%
%\section{Lab 03}
%
%\subsection{Probabilities}
%
%\begin{lstlisting}
%# Generate a number between 0 and n
%x = runif(n)
%
%# Generate an integer between 0 and n
%x = round(runif(n))
%
%# Generate p integers between 0 and n
%x = as.vector(1:p)
%for(i in 1:p){
%   x = round(runif(n)) 
%}
%\end{lstlisting}
%
%\subsection{Discrete Probability Distributions}
%
%\subsection{Useful Statistical Functions}
%
%\begin{lstlisting}
%# Find the mean of a set of numbers
%mean(data)
%
%# Find the variance of a set of numbers
%var(data)
%
%# Find the standard deviation of a set of numbers
%sd(data)
%
%# Find stuff about correlation
%cor(data)
%    
%\end{lstlisting}
%
%There is also code in this section to do it without using these functions, if needed.
%
%\subsection{Binomial and Geometric (and other) Distributions}
%
%Binomial:
%
%\begin{lstlisting}
%# n trials, probability p
%
%# Generate n independent values of X
%rbinom(n,x,p)
%
%# Find P(X = x)
%dbinom(x,n,p)
%
%# Find P(X @$\le$@ x)
%pbinom(x,n,p)
%
%# Find c such that P(X @$\le$@ c) = q
%qbinom(q,n,p)
%
%\end{lstlisting}
%
%Geometric:
%
%\begin{lstlisting}
%# Look in help for:
%dgeom(x,p)
%pgeom(q,p)
%rgeom(n,p)
%\end{lstlisting}
%
%Poisson:
%
%\begin{lstlisting}
%# Look in help for:
%dpois(x, @$\lambda$@)
%ppois(q, @$\lambda$@)
%qpois(p, @$\lambda$@)
%rpois(n, @$\lambda$@)
%\end{lstlisting}
%
%\subsection{Normal Distribution}
%
%\begin{lstlisting}
%# Draw the probability density function of the Z-normal curve
%x = seq(from=-4, to=4, length=200)
%y = dnorm(x, mean=0, sd=1)
%\end{lstlisting}
%
%Pages 12 and 13 have examples of filling areas under the curve. Too large to transcribe here.
%
%\section{Lab 04}
%
%\subsection{Loading Datasets}
%
%\subsection{Confidence Intervals}
%\subsubsection{Point Estimate}
%\subsubsection{Sampling Size Estimate}
%
%\subsection{Hypothesis Testing}
%
%\subsection{Student Distribution?!}
%
%\section{Lab 05}
%
%\subsection{Solving Sets of Linear Equations}
%
%\begin{lstlisting}
%# Ax = b
%x = solve(A, b)
%\end{lstlisting}
%
%\subsection{Constructing Matrices in R (detailed)}
%
%\subsection{Useful Matrix Functions}
%
%\begin{lstlisting}
%# Find Dimensions
%dim(X)
%
%# Find Determinant
%det(X)
%
%# Get Diagonal Elements
%diag(X)
%
%# Find the Trace
%# First, define function
%trace = function(data){
%    return(sum(diag(data)))
%}
%# Now, use it.
%trace(X)
%
%# Matrix Transpose
%t(X)
%
%# Get the Lower or Upper Triangular Matrices
%lower.tri(X)
%upper.tri(X)
%
%# Elementwise Operations
%X + Y
%X * Y
%
%# Matrix Multiplication
%X %*% Y
%
%# Matrix Inversion
%solve(X)
%\end{lstlisting}
%
%\section{Lab 06}
%
%\subsection{Eigenvalues and Eigenvectors}
%
%\begin{lstlisting}
%# Find Eigenvalues and Eigenvectors
%vals = eigen(X)
%eigenvals = vals$values
%eigenvecs = vals$vectors
%\end{lstlisting}
%
%\subsection{Singular Value Decomposition}
%
%\subsection{The "apply" Function}
%
%\section{Lab 07}
%
%\subsection{3D Plots}
%
%\subsection{Principal Component Analysis}
%
%\begin{lstlisting}
%data_pca = princomp(data)
%\end{lstlisting}
%
%DO NOT READ SECTION 2 FOR PCA. IT IS STEP BY STEP INSTEAD OF USING BUILT-IN FUNCTIONS
%
%\subsection{Plotting Principal Component Stuff}
%
%\section{Lab 08}
%
%\subsection{Linear Regression (builtin)}
%
%\subsection{Gradient Descent}
%
%\subsection{Closed Form Solution}
%
%\section{Lab 09}
%
%\subsection{Writing k-means}
%
%\section{Lab 10}
%
%\subsection{Wait... k-means is Built In. Damn.}
%
%\subsection{Cluster Validation}
%\subsubsection{Cohesion}
%\subsubsection{Separation}
%
%\subsection{Hierarchical Clustering}
%
\end{document}
